# importing the data
library(readxl) # import
library(tidyverse) # import
# visualisation
library(ggplot2) # ggplot()
library(ggpubr) # ggarrange()
library(ggfortify)
library(kableExtra)
# data manipulation
library(rcompanion) # tukeyTransform()
# library(psych) #principal()
# library(esquisse)
theme_set(theme_bw()) # set ggplot to b/w
# rm(list = ls())
base_dir <- "~/Documents/repos/dnajc6-larval-behaviour1"
Importing data:
source(file.path(base_dir, "scripts/import.R"))
raw_7 <- read_data("11-Oct-22") %>% subset(.$fish_id %in% grep("2.10", .$fish_id, value = T))
raw_11 <- read_data("11-Oct-22") %>% subset(.$fish_id %in% grep("6.10", .$fish_id, value = T))
raw_12 <- read_data("12-Oct-22") %>% subset(.$fish_id %in% grep("7.10", .$fish_id, value = T))
raw_all <- full_join(raw_11, raw_12)
Zuur - Analysis of Ecological Data
Questions to answer: 1. Where are the data centered? How are they spread? Are they symmetric, skewed, bimodal? 2. Are there outliers? 3. Are the variables normally distributed> 4. Are there any relationships between the variables? Are relationships between the variables linear? Which follow-up analysis should be applied? 5. Do we need a transformation? 6. Was the sampling effort approximately the same for each observation or variable?
There are 7 explanatory and 25 response variables.
variables <- data.frame(
Name = colnames(raw_all),
Variable = c(rep("Explanatory", 7), rep("Response", 25)),
Type = c(rep("Nominal", 3), rep("Ordinal", 4), rep("Continuous", 6), "Discrete", rep("Continuous", 3), "Discrete", rep("Continuous", 14))
)
colnames(raw_all)
## [1] "fish_id" "Genotype" "Position"
## [4] "Tray" "Trial" "Time"
## [7] "Bin" "Dist_travelled_total" "Dist_travelled_mean"
## [10] "Dist_travelled_var" "Velocity_mean" "Velocity_var"
## [13] "Time_moving_total" "Freq_moving" "Acceleration_max"
## [16] "Acceleration_min" "Acceleration_var" "Freq_in_middle"
## [19] "Time_in_middle_total" "Time_in_middle_mean" "Time_in_middle_var"
## [22] "Dist_to_middle_total" "Dist_to_middle_mean" "Dist_to_middle_var"
## [25] "Mobility_total" "Mobility_mean" "Mobility_var"
## [28] "Meander_total" "Meander_mean" "Meander_var"
## [31] "Heading_mean" "Heading_var"
Here I’m going to visualise each continuous variable to see the distribution of them.
feats <- grep("Dist_travelled", colnames(raw_all), value = T)
do.call(cbind, lapply(raw_all[,c(feats)], summary)) %>%
kable() %>%
kable_styling()
| Dist_travelled_total | Dist_travelled_mean | Dist_travelled_var | |
|---|---|---|---|
| Min. | 0.00000 | 0.0000000 | 0.0000000 |
| 1st Qu. | 4.44462 | 0.0029631 | 0.0008255 |
| Median | 42.16390 | 0.0281468 | 0.0138008 |
| Mean | 74.55129 | 0.0497490 | 0.0341725 |
| 3rd Qu. | 125.50250 | 0.0836811 | 0.0427373 |
| Max. | 4203.34000 | 2.8097200 | 17.5365000 |
source(file.path(base_dir, "scripts/DE functions.R"))
lambdas <- rbind(get_lambda(raw_11, feats), get_lambda(raw_12, feats))
lambdas %>% `rownames<-`(c("11 Oct", "12 Oct")) %>% kable() %>%
kable_styling()
| Dist_travelled_total | Dist_travelled_mean | Dist_travelled_var | |
|---|---|---|---|
| 11 Oct | 0.250 | 0.250 | 0.200 |
| 12 Oct | 0.325 | 0.325 | 0.225 |
# dt_all <- select(raw_all, 1:7, all_of(feats)) %>%
# mutate_at(feats[1], function(x){log(x + 1)}) %>%
# mutate_at(feats[2], function(x){x^0.25}) %>%
# mutate_at(feats[3], function(x){x^0.2})
dt_11 <- select(raw_11, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){log(x + 1)}) %>%
mutate_at(feats[2], function(x){x^0.25}) %>%
mutate_at(feats[3], function(x){x^0.2})
dt_12 <- select(raw_12, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){log(x + 1)}) %>%
mutate_at(feats[2], function(x){x^0.325}) %>%
mutate_at(feats[3], function(x){x^0.225})
plots_11 <- lapply(feats, plot_histbox, data = dt_11)
plots_12 <- lapply(feats, plot_histbox, data = dt_12)
plots <- c(rbind(plots_11, plots_12))
ggarrange(plotlist = plots, ncol = 2)
## $`1`
##
## $`2`
##
## $`3`
##
## attr(,"class")
## [1] "list" "ggarrange"
The original plots showed an extreme right skew for the total, mean,
and variance of the distance travelled (mm). A log(x+1) transformation
made the distribution of total distance travelled much less skewed, and
bimodal. The other two (mean and variance) were still severely right
skewed with a log transformation, so I used transformTukey() to perform
Tukey’s Ladder of Powers. As this function takes a maximum sample size
of 5,000, I ran it separately for the 11 and 12 Oct data. The mean
distance travelled was ^0.25 and the variance
^0.2.
Now I’m going to look at lattice graphs for each of the explanatory variables: genotype, (well) position, tray, trial/time, and bin.
plots_11 <- lapply(feats, plot_histbox, data = dt_11, wrap = "Genotype")
plots_12 <- lapply(feats, plot_histbox, data = dt_12, wrap = "Genotype")
plots <- c(rbind(plots_11, plots_12))
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
plots_11 <- lapply(feats, plot_histbox, data = dt_11, wrap = "Tray")
plots_12 <- lapply(feats, plot_histbox, data = dt_12, wrap = "Tray")
plots <- c(rbind(plots_11, plots_12))
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
plots_11 <- lapply(feats, plot_histbox, data = dt_11, wrap = "Trial")
plots_12 <- lapply(feats, plot_histbox, data = dt_12, wrap = "Trial")
plots <- c(rbind(plots_11, plots_12))
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
plots_11 <- lapply(feats, plot_histbox, data = dt_11, wrap = "Bin")
plots_12 <- lapply(feats, plot_histbox, data = dt_12, wrap = "Bin")
plots <- c(rbind(plots_11, plots_12))
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
The plots wrapped by time bin show that the bimodality is due to the change in movement over time. Due to this, I will be separating the raw data into movement in the light (time bins 1, 2, 3, 4, and 5) and in the dark (time bins 6, 7, 8, 9, and 10).
light_bins <- c(1, 2, 3, 4, 5)
raw_light_11 <- raw_11 %>% subset(Bin %in% light_bins)
raw_light_12 <- raw_12 %>% subset(Bin %in% light_bins)
lambdas <- rbind(get_lambda(raw_light_11, feats), get_lambda(raw_light_12, feats))
lambdas %>% `rownames<-`(c("11 Oct", "12 Oct")) %>% kable() %>%
kable_styling()
| Dist_travelled_total | Dist_travelled_mean | Dist_travelled_var | |
|---|---|---|---|
| 11 Oct | 0.200 | 0.200 | 0.15 |
| 12 Oct | 0.225 | 0.225 | 0.15 |
dtlight_11 <- select(raw_light_11, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){x^0.2}) %>%
mutate_at(feats[2], function(x){x^0.2}) %>%
mutate_at(feats[3], function(x){x^0.15})
dtlight_12 <- select(raw_light_12, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){x^0.225}) %>%
mutate_at(feats[2], function(x){x^0.225}) %>%
mutate_at(feats[3], function(x){x^0.15})
plots_11 <- lapply(feats, plot_histbox, data = dtlight_11)
plots_12 <- lapply(feats, plot_histbox, data = dtlight_12)
ggarrange(plotlist = plots_11, ncol = 3)
ggarrange(plotlist = plots_12, ncol = 3)
plots_11 <- lapply(feats, plot_histbox, data = dtlight_11, wrap = "Genotype")
plots_12 <- lapply(feats, plot_histbox, data = dtlight_12, wrap = "Genotype")
plots <- c(rbind(plots_11, plots_12))
ggarrange(plotlist = plots_11, ncol = 3)
ggarrange(plotlist = plots_12, ncol = 3)
dark_bins <- c(6, 7, 8, 9, 10)
raw_dark_11 <- raw_11 %>% subset(Bin %in% dark_bins)
raw_dark_12 <- raw_12 %>% subset(Bin %in% dark_bins)
lambdas <- rbind(get_lambda(raw_dark_11, feats), get_lambda(raw_dark_12, feats))
lambdas %>% `rownames<-`(c("11 Oct", "12 Oct")) %>% kable() %>%
kable_styling()
| Dist_travelled_total | Dist_travelled_mean | Dist_travelled_var | |
|---|---|---|---|
| 11 Oct | 0.475 | 0.475 | 0.400 |
| 12 Oct | 0.475 | 0.475 | 0.175 |
dtdark_11 <- select(raw_dark_11, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){x^0.2}) %>%
mutate_at(feats[2], function(x){x^0.2}) %>%
mutate_at(feats[3], function(x){x^0.15})
dtdark_12 <- select(raw_dark_12, 1:7, all_of(feats)) %>%
mutate_at(feats[1], function(x){x^0.225}) %>%
mutate_at(feats[2], function(x){x^0.225}) %>%
mutate_at(feats[3], function(x){x^0.15})
plots_11 <- lapply(feats, plot_histbox, data = dtdark_11)
plots_12 <- lapply(feats, plot_histbox, data = dtdark_12)
ggarrange(plotlist = plots_11, ncol = 3)
ggarrange(plotlist = plots_12, ncol = 3)
plots_11 <- lapply(feats, plot_hist, data = dtdark_11, wrap = "Genotype")
plots_12 <- lapply(feats, plot_hist, data = dtdark_12, wrap = "Genotype")
plots <- c(rbind(plots_11, plots_12))
ggarrange(plotlist = plots_11, ncol = 3)
ggarrange(plotlist = plots_12, ncol = 3)
# get_summary <- function(data) {
# do.call(cbind, lapply(data[,c(feats)], summary))
# }
#
# dtdark_geno_11 <- group_split(dtdark_11, Genotype)
#
# lapply(dtdark_geno_11, get_summary) %>%
# kable() %>% kable_styling()
Pairplots to see the relationship between all variables.
# variances <- colnames(raw_12)[grep("var", colnames(raw_12))]
#
# pvars <- colnames(raw_12)[!colnames(raw_12) %in% variances]
#
# pairs_12 <- pairs(raw_12[,pvars])
# mydata <- read.csv("Fish_data.csv", header = TRUE)
#
#
#
# mydata_dark <- mydata %>%
# dplyr::filter(Bin == 5| Bin == 6 | Bin == 7 | Bin == 8 | Bin == 9)
#
# mydata_dark$Genotype <- factor(mydata_dark$Genotype, levels = c("wt", "het", "hom"))
#
# mydata_dark$Exclude <- 0
# mydata_dark$Exclude[mydata_dark$Locomotor_dark > 3] <- 1
#
# mydata_dark <- mydata_dark %>%
# dplyr::filter(Exclude == 0)
#
# ####PCA's####
# #Locomotor Score
#
#
# pca1 <- principal(mydata_dark[,c(8:17, 25:27 ),],nfactors=1,rotate="none")
# pca1
# mydata_dark$Locomotor_dark <- pca1$scores
#
# mydata_dark$Locomotor_dark <- as.numeric(mydata_dark$Locomotor_dark)
#
# summary(mydata_dark$Locomotor_dark)
#
# ggplot(mydata_dark) +
# aes(x = Genotype, y = Locomotor_dark, fill = Genotype) +
# geom_boxplot(shape = "circle") +
# scale_fill_brewer(palette = "Blues", direction = 1) +
# labs(y = "Locomotor Score", title = "Dark Condition") +
# theme_minimal() +
# theme(legend.position = "none")
#
#
# # Anxiety Score
#
# pca2 <- principal(mydata_dark[,c(19:24 ),],nfactors=1,rotate="none")
# pca2
# mydata_dark$Anxiety_dark <- pca2$scores
#
# mydata_dark$Anxiety_dark <- as.numeric(mydata_dark$Anxiety_dark)
# summary(mydata_dark$Anxiety_dark)
#
# tukey <- mydata_dark %>% tukey_hsd(Anxiety_dark ~ Genotype) %>% adjust_pvalue() %>% add_xy_position(x = "Genotype")
#
#
# ggboxplot(mydata_dark, x = "Genotype", y = "Anxiety_dark", scales = "fixed", fill = "Genotype") +
# scale_fill_brewer(palette = "Greys",
# direction = 1) +
# theme_minimal() +
# labs(x = "Genotype", y = "Anxiety Score") +
# theme(axis.title.y = element_text(size = 11L,
# face = "bold"), axis.title.x = element_text(size = 11L, face = "bold"),
# plot.title = element_text(size = 14L, face = "bold"), legend.position = "none") +
# ylim(-3, 4.75) +
# stat_pvalue_manual(tukey, label = "p.adj.signif", tip.length = 0.01, bracket.nudge.y = -2,
# y.position = c(4.25, 4.70, 4.25))
#
#
# ####Logistic Regression - Glm or polr####
#
#
# model <- polr(as.factor(Genotype) ~ Anxiety_dark + Locomotor_dark + Trial, mydata_dark, Hess = TRUE)
#
# tab_model(model)
#
# polr
#
#
# #New CSV
#
# write.csv(mydata_dark, "Dark_PCA_Data.csv")